package com.mw.util;

import com.mw.domain.News;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Small regex-based scraper: downloads pages with a plain GET request and extracts news entries
 * (title, author, body) from a listing page plus the article pages it links to.
 *
 * <p>All members are static; the class holds no mutable state.
 */
public class Spider {
    /** Upper bound, in milliseconds, for establishing the connection. */
    private static final int CONNECT_TIMEOUT_MS = 10000;
    /** Upper bound, in milliseconds, for waiting on data once connected. */
    private static final int READ_TIMEOUT_MS = 30000;

    // The patterns are compiled once instead of on every call / loop iteration.
    /** Captures the {@code title} attribute value that follows an {@code <h3} on the listing page. */
    private static final Pattern TITLE_PATTERN = Pattern.compile("<h3.+?title=\"(.+?)\"\\st");
    /** Captures the protocol-relative {@code href} (without the leading {@code //}) after an {@code <h3}. */
    private static final Pattern URL_PATTERN = Pattern.compile("<h3.+?href=\"//(.+?)\"");
    /** Captures the author text inside the {@code <span>} that follows a bullet on an article page. */
    private static final Pattern AUTHOR_PATTERN = Pattern.compile("•\\s<span>(.+?)<");
    /** Captures the article body that follows the {@code hb_content} marker on an article page. */
    private static final Pattern CONTEXT_PATTERN = Pattern.compile("hb_content\">\\s+(.+?)</div");

    /**
     * Fetches the resource at {@code url} and returns its body decoded as UTF-8.
     *
     * <p>The response is read line by line and the lines are joined <em>without</em> separators,
     * so the result is a single line. The patterns used by {@link #GetNews(String)} rely on this,
     * because {@code .} does not match line terminators by default.
     *
     * <p>This method never throws: any failure (malformed URL, I/O error, timeout) is reported on
     * standard output / standard error and whatever was read up to that point is returned.
     *
     * @param url absolute URL to fetch
     * @return the concatenated response lines; an empty string if nothing could be read
     */
    public static String SendGet(String url) {
        // StringBuilder avoids the quadratic cost of repeated String concatenation on large pages.
        StringBuilder result = new StringBuilder();
        BufferedReader in = null;

        try {
            URL realUrl = new URL(url);
            URLConnection connection = realUrl.openConnection();
            // Without timeouts a single unresponsive host would block the caller indefinitely.
            connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
            connection.setReadTimeout(READ_TIMEOUT_MS);
            connection.connect();

            in = new BufferedReader(new InputStreamReader(
                    connection.getInputStream(), "UTF-8"));
            String line;
            while ((line = in.readLine()) != null) {
                result.append(line);
            }
        } catch (Exception e) {
            // Deliberately broad: callers rely on this method returning normally on any failure.
            System.out.println("发送GET请求出现异常！" + e);
            e.printStackTrace();
        } finally {
            // Close the reader (and the underlying stream) whether or not the read succeeded.
            try {
                if (in != null) {
                    in.close();
                }
            } catch (Exception e2) {
                e2.printStackTrace();
            }
        }
        return result.toString();
    }

    /**
     * Extracts news entries from the HTML of a listing page.
     *
     * <p>For every title/link pair found in {@code content}, the linked article is downloaded via
     * {@link #SendGet(String)} (using {@code http://}) and its author and body are extracted. An
     * entry is added for each pair even when the article page yields no author/body match; in that
     * case only the title is set.
     *
     * <p>NOTE(review): titles and links are located by two independent matchers that are advanced
     * in lock-step. If a heading carries only one of the two attributes the pairs can drift out of
     * alignment — confirm against the actual page markup before relying on the pairing.
     *
     * @param content single-line HTML of the listing page; {@code null} is treated as empty
     * @return the extracted entries in document order; never {@code null}
     */
    public static ArrayList<News> GetNews(String content) {
        ArrayList<News> results = new ArrayList<News>();
        if (content == null) {
            // Pattern.matcher(null) would throw; there is simply nothing to extract.
            return results;
        }

        Matcher titleMatcher = TITLE_PATTERN.matcher(content);
        Matcher urlMatcher = URL_PATTERN.matcher(content);

        // An entry is produced only while both a title and a link can still be found.
        while (titleMatcher.find() && urlMatcher.find()) {
            News news = new News();
            news.setTitle(titleMatcher.group(1));

            // The captured link is protocol-relative, so the scheme is added here.
            String articleHtml = Spider.SendGet("http://" + urlMatcher.group(1));
            fillArticleDetails(news, articleHtml);

            results.add(news);
        }

        return results;
    }

    /**
     * Sets author and body on {@code news} from the article page HTML.
     *
     * <p>Author and body matches are consumed in pairs; when several pairs exist the last one
     * wins. If either pattern has no match, {@code news} is left untouched.
     */
    private static void fillArticleDetails(News news, String articleHtml) {
        Matcher authorMatcher = AUTHOR_PATTERN.matcher(articleHtml);
        Matcher contextMatcher = CONTEXT_PATTERN.matcher(articleHtml);

        while (authorMatcher.find() && contextMatcher.find()) {
            String author = authorMatcher.group(1);
            // An empty author span makes the capture swallow the closing tag itself;
            // such entries get the placeholder author.
            if ("</span>".equals(author)) {
                news.setAuthor("无");
            } else {
                news.setAuthor(author);
            }
            news.setContext(contextMatcher.group(1));
        }
    }

}
